# -*- coding: utf-8 -*-

'''
程 序 名：墨迹天气爬虫
编 写 人：bxgj
运行环境：win7x64 Python3.6.4
修改日志：2018.09.09 新建
          2018.09.10 完成爬取主要天气信息功能
          2018.09.12 完成爬取风力等信息的功能
          2018.09.14 完成爬取空气质量信息的功能
          2018.09.15 重构代码
          2018.09.16 完成数据保存功能
          2018.09.21 功能基本完成，修复部分细节问题
版    本：V1.2
备    注：由于最近没有极端天气，所以爬取高温预警、雷电预警之类的功能暂未完成

空气质量评定标准，AQI分级
优 0-50
良 51-100
轻度污染 101-150
中度污染 151-200
重度污染 201-300
严重污染 301-500
500以上爆表了


'''

# ---------------- 导入模块 ----------------
from bs4 import BeautifulSoup
import requests
import time
import datetime
import os
import re
import csv


# ---------------- 全局变量、初始化等 ----------------

# URLs to scrape: weather forecast and air quality for Yanta District, Xi'an.
# See the link-format notes in the comment block below for other locations.
wea_url = "https://tianqi.moji.com/weather/china/shaanxi/yanta-district"
aqi_url = "https://tianqi.moji.com/aqi/china/shaanxi/yanta-district"

'''
墨迹天气官方网站
https://tianqi.moji.com/

链接格式
天气预报 https://tianqi.moji.com/weather/china/省会/市、区、县等
空气质量 https://tianqi.moji.com/aqi/china/省会/市、区、县等

地级市，直接用拼音，如
https://tianqi.moji.com/weather/china/shaanxi/xian

市区，区名的拼音加-district，如
https://tianqi.moji.com/weather/china/shaanxi/yanta-district

县，县名的拼音加-county
镇，镇名的拼音加-town

其他的地名，如XX山、XX风景区、XX湖基本都是拼音加英文

特殊地名特殊处理，如
秦始皇陵
mausoleum-of-the-first-qin-emperor
陕西历史博物馆
shanxi-history-museum

'''


# Debug mode: when True, the spiders parse saved local HTML files
# (yanta.html / air.html) instead of requesting the live site, and
# main() writes to weather_debug.* output files.
# DEBUG_MODE = True
DEBUG_MODE = False


# ---------------- 类定义 ----------------

# Real-time weather record (current conditions plus air-quality details).
class weather():
    """Holds one scraped snapshot of current weather and air quality.

    All fields default to None/"" and are filled in later by the spiders.
    """
    def __init__(self, uptime=None, city="", weather="", temp=None, 
                humi=None, wind_dir="", wind_min=None, wind_max=None, tips="", description="",
                aqi=None, aqi_level="", aqi_PM10=None, aqi_PM2P5=None, aqi_NO2=None, aqi_SO2=None, aqi_O3=None, aqi_CO=None,
                other_info=""):

        # Time this record was scraped (truncated to whole seconds).
        crawltime = datetime.datetime.now()
        self.crawltime = crawltime.replace(microsecond=0)

        # Basic weather info
        self.uptime = uptime  # update time shown on the website, NOT the scrape time
        self.city = city
        self.weather = weather
        self.temp = temp

        # Weather details
        self.humi = humi
        self.wind_dir = wind_dir
        self.wind_min = wind_min
        self.wind_max = wind_max
        self.tips = tips
        self.description = description  # overall weather description

        # Air quality
        self.aqi = aqi
        self.aqi_level = aqi_level
        self.aqi_PM10 = aqi_PM10
        self.aqi_PM2P5 = aqi_PM2P5
        self.aqi_NO2 = aqi_NO2
        self.aqi_SO2 = aqi_SO2
        self.aqi_O3 = aqi_O3
        self.aqi_CO = aqi_CO

        # Other info, e.g. heat or lightning alerts
        self.other_info = other_info

    # Raw field values in the canonical column order (single source of
    # truth shared by the accessors below and __str__).
    def _raw_fields(self):
        return [self.crawltime, self.uptime, self.city, self.weather, self.temp,
                self.humi, self.wind_dir, self.wind_min, self.wind_max, self.tips, self.description,
                self.aqi, self.aqi_level, self.aqi_PM10, self.aqi_PM2P5, self.aqi_NO2, self.aqi_SO2,
                self.aqi_O3, self.aqi_CO, self.other_info]

    # Convenient for saving to CSV etc.; only the timestamps are stringified.
    def get_weather_info_list(self):
        fields = self._raw_fields()
        return [str(fields[0]), str(fields[1])] + fields[2:]

    # Every field converted to str (None becomes the string "None").
    def get_weather_info_str(self):
        return [str(v) for v in self._raw_fields()]

    # Pipe-separated single line; missing (None) fields become empty.
    def __str__(self):
        # BUGFIX: the old code did "|".join(...).replace("None", ""), which
        # also deleted the literal substring "None" from real field text
        # (e.g. tips). Blank out only fields that are actually None.
        return "|".join("" if v is None else str(v) for v in self._raw_fields())


# Forecast weather record — a simplified version of the weather class.
class weather_forecast():
    """Holds one forecast day's summary (weather, temp/wind ranges, AQI)."""
    def __init__(self, uptime=None, city="", weather="", temp_min=None, temp_max=None, 
                wind_dir="", wind_min=None, wind_max=None, aqi=None, aqi_level=None, other_info=""):

        # Time this record was scraped (truncated to whole seconds).
        crawltime = datetime.datetime.now()
        self.crawltime = crawltime.replace(microsecond=0)

        # Basic weather info
        self.uptime = uptime  # update time shown on the website, NOT the scrape time
        self.city = city
        self.weather = weather
        self.temp_min = temp_min
        self.temp_max = temp_max

        # Weather details
        self.wind_dir = wind_dir
        self.wind_min = wind_min
        self.wind_max = wind_max

        # Air quality
        self.aqi = aqi
        self.aqi_level = aqi_level

        # Other info, e.g. heat or lightning alerts
        self.other_info = other_info

    # Raw field values in the canonical column order.
    def _raw_fields(self):
        return [self.crawltime, self.uptime, self.city, self.weather, self.temp_min, self.temp_max,
                self.wind_dir, self.wind_min, self.wind_max, self.aqi, self.aqi_level, self.other_info]

    # Convenient for saving to files; only the timestamps are stringified.
    def get_weather_info_list(self):
        fields = self._raw_fields()
        return [str(fields[0]), str(fields[1])] + fields[2:]

    # Every field converted to str (None becomes the string "None").
    def get_weather_info_str(self):
        return [str(v) for v in self._raw_fields()]

    # Pipe-separated single line; missing (None) fields become empty.
    def __str__(self):
        # BUGFIX: the old "|".join(...).replace("None", "") also deleted the
        # literal substring "None" from real text; blank only true None fields.
        return "|".join("" if v is None else str(v) for v in self._raw_fields())

# ---------------- 函数定义 ----------------

# Weather spider: fills weather_info and weather_forecast_info in place.
def weather_spider(weather_info, weather_forecast_info, url):
    """Scrape current conditions and the 3-day forecast from a moji.com page.

    Args:
        weather_info: weather instance, updated in place.
        weather_forecast_info: list of (at least 3) weather_forecast
            instances, one per forecast day, updated in place.
        url: weather page URL (see link-format notes at the top of the file).

    Returns None; on network/server errors a message is printed and the
    objects are left untouched.
    """
    if DEBUG_MODE:
        # Parse a saved local page instead of hitting the live site.
        with open("yanta.html", 'r', encoding='utf-8') as html_file:
            html_text = html_file.read()
        soup = BeautifulSoup(html_text, "lxml")
    else:
        try:
            web_data = requests.get(url)
        except requests.exceptions.ConnectionError:
            print("网络连接异常")
            return
        except Exception:
            print("其他异常")
            return

        if web_data.status_code != requests.codes.ok:
            print("服务器响应异常", web_data.status_code)
            return

        soup = BeautifulSoup(web_data.text, "lxml")

    # City name
    city_tag = soup.select(".search_default > em")
    weather_info.city = city_tag[0].get_text().replace(' ', '')

    # Publication time: the page shows only HH:MM, so combine it with today's date.
    uptime_tag = soup.find("strong", class_="info_uptime")
    hour_minute = re.search(r"(\d+):(\d+)", uptime_tag.get_text()).groups()
    now = datetime.datetime.now()
    weather_info.uptime = now.replace(hour=int(hour_minute[0]), minute=int(hour_minute[1]),
                                      second=0, microsecond=0)

    # Overall description from the <meta name="description"> tag.
    description_tag = soup.select('meta[name="description"]')
    weather_description = description_tag[0].get("content")
    # BUGFIX: str.replace returns a new string — the old code discarded the
    # result, so spaces and English commas were never actually cleaned up.
    weather_description = weather_description.replace(" ", "").replace(",", "，")
    weather_info.description = weather_description

    # Current conditions: <em> holds the temperature, <b> the weather text.
    tmp_tag = soup.find("div", class_="wea_weather clearfix")
    weather_info.temp = float(tmp_tag.em.string)
    weather_info.weather = tmp_tag.b.string

    # Humidity and wind info live under this tag.
    tmp_tag = soup.find("div", class_="wea_about clearfix")

    humi_str = tmp_tag.span.string  # humidity is in the <span>
    weather_info.humi = int(re.search(r"(\d+)", humi_str).group())

    wind_str = tmp_tag.em.string  # wind info is in the <em>
    try:
        wind_dir = re.search(r"([东西南北微无台]+风)", wind_str).group()
    except AttributeError:
        print("可能是风力等级有新的汉字，原始数据：", wind_str)
    else:
        weather_info.wind_dir = wind_dir

    # BUGFIX: wind text may carry no digits (the direction regex above already
    # anticipates e.g. "微风"); the old .group() call would then crash with
    # AttributeError. Also store the level as int, consistent with the
    # forecast records.
    wind_match = re.search(r"(\d+)", wind_str)
    if wind_match is not None:
        weather_info.wind_min = int(wind_match.group())

    tmp_tag = soup.find("div", class_="wea_tips clearfix")
    weather_info.tips = tmp_tag.em.string

    # 3-day forecast: one <ul class="days clearfix"> per day.
    days_tags = soup.find_all("ul", class_="days clearfix")
    for day_num, oneday in enumerate(days_tags):
        items_tag = oneday.find_all("li")
        forecast = weather_forecast_info[day_num]

        # City and update time were scraped above; reuse them here.
        forecast.uptime = weather_info.uptime
        forecast.city = weather_info.city

        # item 0 is "today / tomorrow / day after" — deliberately skipped.

        # item 1: weather text (all whitespace stripped)
        forecast.weather = re.sub(r"\s", "", items_tag[1].get_text())

        # item 2: min/max temperature
        temps = re.findall(r"(\d+)", items_tag[2].get_text())
        forecast.temp_min = int(temps[0])
        forecast.temp_max = int(temps[1])

        # item 3: wind direction (<em>) and level range (<b>)
        forecast.wind_dir = items_tag[3].em.get_text()
        wind_values = re.findall(r"(\d+)", items_tag[3].b.get_text())
        if len(wind_values) == 1:
            forecast.wind_min = int(wind_values[0])
        elif len(wind_values) == 2:
            forecast.wind_min = int(wind_values[0])
            forecast.wind_max = int(wind_values[1])
        else:
            print("解析风力级数出错")

        # item 4: AQI value followed by the level text
        aqi_str = re.sub(r"\s", "", items_tag[4].get_text())
        forecast.aqi = int(re.search(r"(\d+)", aqi_str).group())
        forecast.aqi_level = re.search(r"(\D+)", aqi_str).group()


# Air-quality spider: fills the AQI fields of weather_info in place.
def aqi_spider(weather_info, url):
    """Scrape the AQI page and store pollutant readings on weather_info.

    On network/server errors a message is printed and weather_info is
    left untouched.
    """
    if (DEBUG_MODE):
    # Parse a saved local page instead of hitting the live site.
        with open("air.html", 'r', encoding='utf-8') as html_file:
            html_text = html_file.read()
        soup = BeautifulSoup(html_text, "lxml")
    else:
        try:
            web_data = requests.get(url)
        except requests.exceptions.ConnectionError:
            print("网络连接异常")
            return
        except Exception:
            print("其他异常")
            return

        if (web_data.status_code != requests.codes.ok):
            print("服务器响应异常", web_data.status_code)
            return

        soup = BeautifulSoup(web_data.text, "lxml")


    # Overall AQI value and level text, located by element id.
    aqi_value_tag = soup.select("#aqi_value")
    weather_info.aqi = int(aqi_value_tag[0].get_text())

    aqi_desc_tag = soup.select("#aqi_desc")
    weather_info.aqi_level = aqi_desc_tag[0].get_text()

    # Per-pollutant details live under the first <ul class="clearfix">.
    aqi_info_item = soup.find("ul", class_="clearfix")

    # Names of the pollutant entries (<em> tags).
    aqi_info_name_list = []
    aqi_info_name_tag = aqi_info_item.find_all("em")
    for name in aqi_info_name_tag:
        aqi_info_name_list.append("".join(name.strings))

    # Values of the pollutant entries (<span> tags).
    # NOTE(review): int() raises ValueError on a non-integer reading — CO is
    # often reported as a decimal (e.g. 0.8) — confirm against the live page.
    aqi_info_value_list = []
    aqi_info_value_tag = aqi_info_item.find_all("span")
    for value in aqi_info_value_tag:
        aqi_info_value_list.append(int(value.string))

    # Zip names and values into a dict keyed by pollutant name.
    aqi_info_list = dict(zip(aqi_info_name_list, aqi_info_value_list))
    # print(aqi_info_list)

    # Store on the weather record.
    weather_info.aqi_PM10 = aqi_info_list["PM10"]
    weather_info.aqi_PM2P5 = aqi_info_list["PM2.5"]
    weather_info.aqi_NO2 = aqi_info_list["NO2"]
    weather_info.aqi_SO2 = aqi_info_list["SO2"]
    weather_info.aqi_O3 = aqi_info_list["O3"]
    weather_info.aqi_CO = aqi_info_list["CO"]

    # Publication time.
    # This timestamp lags badly on the site, so it is not used.
    # aqi_info_time_raw = soup.find(class_="aqi_info_time")
    # aqi_info_time = aqi_info_time_raw.b.string
    # # print(aqi_info_time)
    # uptime = re.search(r"(\d+)年(\d+)月(\d+)日 (\d+)时(\d+)分", aqi_info_time).groups()
    # print(uptime)



# Main: set up the output files, then scrape and save in an endless loop.
def main():
    """Scrape weather + AQI every 45 minutes and persist the results.

    Output goes to a "weather_data" directory next to this script:
      - weather.csv: append-only history of the real-time records
      - weather.txt: latest snapshot (current + 3-day forecast), one
        pipe-separated line each, for other programs to consume.
    Debug mode writes to weather_debug.* instead.
    """
    this_path = os.path.realpath(__file__)
    dir_path = os.path.dirname(this_path)
    save_path = os.path.join(dir_path, "weather_data")
    if DEBUG_MODE:
        save_csv_filename = os.path.join(save_path, "weather_debug.csv")
        save_txt_filename = os.path.join(save_path, "weather_debug.txt")
    else:
        save_csv_filename = os.path.join(save_path, "weather.csv")
        save_txt_filename = os.path.join(save_path, "weather.txt")

    # BUGFIX: exists() + mkdir() is racy and mkdir fails if an intermediate
    # directory is missing; makedirs(exist_ok=True) handles both cases.
    os.makedirs(save_path, exist_ok=True)

    # Write the CSV header once, when the file is first created.
    # GBK encoding is kept so the CSV opens cleanly in Excel on Chinese Windows.
    if not os.path.exists(save_csv_filename):
        with open(save_csv_filename, "w", newline="", encoding="GBK") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(["爬取时间", "更新时间", "地区", "天气", "温度",
            "湿度", "风向", "最小风力", "最大风力", "小贴士", "描述",
            "空气质量指数", "空气质量等级", "PM10", "PM2.5", "NO2", "SO2", "O3", "CO", "其他信息"])

    count = 0
    while True:
        # Scrape current conditions plus the 3-day forecast.
        all_weather_info = weather()
        all_weather_forecast_info = [weather_forecast(), weather_forecast(), weather_forecast()]

        weather_spider(all_weather_info, all_weather_forecast_info, wea_url)
        aqi_spider(all_weather_info, aqi_url)

        print(all_weather_info.get_weather_info_list())
        for info in all_weather_forecast_info:
            print(info.get_weather_info_list())

        # Append this record to the CSV history.
        with open(save_csv_filename, "a", newline="", encoding="GBK") as csv_file:
            csv.writer(csv_file).writerow(all_weather_info.get_weather_info_list())

        # Overwrite the latest-snapshot file for other programs to read.
        with open(save_txt_filename, "w", encoding="GBK") as txt_file:
            txt_file.write(str(all_weather_info) + "\n")
            for item in all_weather_forecast_info:
                txt_file.write(str(item) + "\n")

        count += 1
        print("第%d次爬取完成，待机中……\n" % (count))
        time.sleep(2700)  # 45 minutes between scrapes


# ---------------- 程序入口 ----------------
if __name__ == "__main__":
    main()
